Data loading

Load required packages

library(monocle)
library(igraph)
library(ComplexHeatmap)

library(Matrix)
library(tidyverse)
library(magrittr)
library(scales)
library(gridExtra)
library(extrafont)
library(ggrepel)

Read in expression matrix

data_directory <- "../../data/10x/"

expr_readcount_use <-
    sparseMatrix(i = c(readRDS(file.path(data_directory, "expr_readcount_raw_csc_indices_part1.rds")),
                       readRDS(file.path(data_directory, "expr_readcount_raw_csc_indices_part2.rds"))),
                 p = readRDS(file.path(data_directory, "expr_readcount_raw_csc_indptr.rds")),
                 x = readRDS(file.path(data_directory, "expr_readcount_raw_csc_values.rds")),
                 dims = readRDS(file.path(data_directory, "expr_readcount_raw_csc_shape.rds")),
                 dimnames = readRDS(file.path(data_directory, "expr_readcount_raw_csc_dimnames.rds")),
                 index1 = FALSE)
dim(expr_readcount_use)
## [1] 27999 34564

Load pseudotime reconstruction result

data_directory <- file.path(data_directory,
                            "epicardial_3f_d7")

cell_dataset <-
    readRDS(file = file.path(data_directory,
                             "cell_dataset_lowerDetectionLimit0.5_DDRTree_dim2_reverse.rds"))

Load gene info

gene_symbols <- read.table(file.path("../..",
                                     "data",
                                     "misc",
                                     "genes.tsv"),
                           header = FALSE,
                           row.names = 1,
                           stringsAsFactors = FALSE)
gene_symbols <- setNames(gene_symbols[[1]], rownames(gene_symbols))
head(gene_symbols)
## ENSMUSG00000051951 ENSMUSG00000089699 ENSMUSG00000102343 
##             "Xkr4"           "Gm1992"          "Gm37381" 
## ENSMUSG00000025900 ENSMUSG00000109048 ENSMUSG00000025902 
##              "Rp1"              "Rp1"            "Sox17"

Load functions

analyze_gene_enrichment <- function(expr,
                                    cell_group_a,
                                    cell_group_b,
                                    gene_symbols = NULL,
                                    expr_cpm = NULL) {

    m <- Matrix::rowSums(expr[, cell_group_b] > 0)
    m1 <- m
    m1[m == 0] <- 1

    n <- Matrix::rowSums(expr[, cell_group_a] > 0)

    pv1 <- pbinom(n, length(cell_group_a), m1 / length(cell_group_b),
                  lower.tail = FALSE) + dbinom(n, length(cell_group_a),
                                               m1 / length(cell_group_b))

    expressed_ratio_log_fold <-
        log(n * length(cell_group_b) / (m * length(cell_group_a)), base = 2)

    d1 <- data.frame(log2.effect = expressed_ratio_log_fold,
                     pval = pv1)

    d1$pos.frac.a <- Matrix::rowMeans(expr[, cell_group_a] > 0)
    d1$pos.frac.b <- Matrix::rowMeans(expr[, cell_group_b] > 0)

    if (! is.null(gene_symbols)) {
        d1$symbol <- gene_symbols[rownames(d1)]
    }

    if (! is.null(expr_cpm)) {
        d1$mean.cpm.a <- Matrix::rowMeans(expr_cpm[rownames(d1), cell_group_a])
        d1$mean.cpm.b <- Matrix::rowMeans(expr_cpm[rownames(d1), cell_group_b])
    }

    d1$p.adj <- p.adjust(d1$pval, method = "BH")
    return(d1)
}

extract_monocle_result <- function(cell_dataset,
                                   x = 1,
                                   y = 2) {

    reduced_dim_coords <- monocle::reducedDimK(cell_dataset)

    ica_space_df <- Matrix::t(reduced_dim_coords) %>% as.data.frame %>%
        select(prin_graph_dim_1 = x,
               prin_graph_dim_2 = y) %>%
        mutate(sample_name = rownames(.), sample_state = rownames(.))

    mst_branch_nodes <-
        cell_dataset@auxOrderingData[[
            cell_dataset@dim_reduce_type]]$branch_points
    branch_point_df <- ica_space_df %>%
        slice(match(mst_branch_nodes, sample_name)) %>%
        mutate(branch_point_idx = seq_len(n()))

    dp_mst <- monocle::minSpanningTree(cell_dataset)

    edge_df <-
        dp_mst %>% igraph::as_data_frame %>%
        select(source = "from", target = "to") %>%
        left_join(ica_space_df %>%
                      select(source = "sample_name",
                             source_prin_graph_dim_1 = "prin_graph_dim_1",
                             source_prin_graph_dim_2 = "prin_graph_dim_2"),
                  by = "source") %>%
        left_join(ica_space_df %>%
                      select(target = "sample_name",
                             target_prin_graph_dim_1 = "prin_graph_dim_1",
                             target_prin_graph_dim_2 = "prin_graph_dim_2"),
                  by = "target")

    sample_state <- pData(cell_dataset)$State
    lib_info_with_pseudo <- pData(cell_dataset)

    data_df <- t(monocle::reducedDimS(cell_dataset)) %>%
        as.data.frame %>%
        select(data_dim_1 = x, data_dim_2 = y) %>%
        rownames_to_column("sample_name") %>%
        mutate(sample_state) %>%
        left_join(lib_info_with_pseudo %>% rownames_to_column("sample_name"),
                  by = "sample_name")

    return(setNames(list(data_df = data_df,
                         edge_df = edge_df,
                         branch_point_df = branch_point_df),
                    c("data", "edge", "brach_point")))
}

plot_pseudotime_batch <- function(batch,
                                  text,
                                  monocle_out) {

    text_use <- text

    p <-
        ggplot(data = subset(monocle_out[[1]], ! category %in% batch),
               aes(x = data_dim_1,
                   y = data_dim_2)) +
        geom_segment(aes_string(x = "source_prin_graph_dim_1",
                                y = "source_prin_graph_dim_2",
                                xend = "target_prin_graph_dim_1",
                                yend = "target_prin_graph_dim_2"),
                     size = .25,
                     linetype = "solid", na.rm = TRUE,
                     color = "#2196F3",
                     data = monocle_out[[2]]) +
        geom_point(size = .3, stroke = 0, shape = 16, alpha = 1,
                   color = "grey70") +
        geom_point(data = subset(monocle_out[[1]],
                                 category %in% batch),
                   color = "#FF5722",
                   size = .3, stroke = 0, shape = 16, alpha = 1) +
        labs(x = "Dimension 1",
             y = "Dimension 2",
             color = NULL) +
        theme_void() +
        theme(legend.justification = c(0, 1),
              # legend.position = c(.15, .4),
              legend.position = c(.19, .4),
              legend.text = element_text(family = "Arial",
                                         size = 4,
                                         margin = margin(t = 0, r = 0,
                                                         b = 0, l = -1.8,
                                                         unit = "mm")),
              legend.key.size = unit(1.5, "mm"),
              legend.margin = margin(t = 0, r = 0, b = 0, l = 0, unit = "mm")) +
        annotate("text",
                 x = -6,
                 y = 4.5,
                 family = "Arial",
                 label = text_use,
                 color = "black",
                 size = 2.5,
                 vjust = "inward", hjust = "inward")

    return(p)
}

plot_expr_pseudotime_batch <- function(gene,
                                       batch,
                                       text,
                                       expr_cpm,
                                       monocle_out) {

    gene_use <- gene
    text_use <- text

    monocle_out[[1]]$expr <-
        log10(expr_cpm[gene_use, monocle_out[[1]]$sample_name] + 1)

    p <-
        ggplot(data = subset(monocle_out[[1]], ! category %in% batch),
               aes(x = data_dim_1,
                   y = data_dim_2)) +
        geom_segment(aes_string(x = "source_prin_graph_dim_1",
                                y = "source_prin_graph_dim_2",
                                xend = "target_prin_graph_dim_1",
                                yend = "target_prin_graph_dim_2"),
                     size = .25,
                     linetype = "solid", na.rm = TRUE,
                     color = "#2196F3",
                     data = monocle_out[[2]]) +
        geom_point(size = .3, stroke = 0, shape = 16, alpha = 1,
                   color = "grey70") +
        geom_point(data = subset(monocle_out[[1]],
                                 category %in% batch)[order(subset(monocle_out[[1]],
                                                            category %in% batch)$expr),],
            aes(color = expr),
            size = .3, stroke = 0, shape = 16, alpha = 1) +
        scale_color_viridis_c(name = NULL) +
        theme_void() +
        theme(legend.justification = c(0, 1),
              # legend.position = c(.15, .4),
              legend.position = c(.19, .4),
              legend.text = element_text(family = "Arial",
                                         size = 4,
                                         margin = margin(t = 0, r = 0,
                                                         b = 0, l = -1.8,
                                                         unit = "mm")),
              legend.key.size = unit(1.5, "mm"),
              legend.margin = margin(t = 0, r = 0, b = 0, l = 0, unit = "mm")) +
        annotate("text",
                 x = -6,
                 y = 4.5,
                 family = "Arial",
                 label = text_use,
                 color = "black",
                 size = 2.5,
                 vjust = "inward", hjust = "inward")

    return(p)
}

Construction of single cell reprogramming trajectory

Load Monocle result and calculate CPM

monocle_out <- extract_monocle_result(cell_dataset)

expr_cpm_use <- expr_readcount_use
expr_cpm_use@x <- 1000000 *
    (expr_cpm_use@x / rep.int(Matrix::colSums(expr_cpm_use),
                              diff(expr_cpm_use@p)))

Visualization of reprogramming trajectory

# generate cell state labels
state_labels <- monocle_out[[1]] %>%
    group_by(State) %>%
    summarise(x = mean(data_dim_1),
              y = mean(data_dim_2)) %>% as.data.frame

p_pseudotime <-
    ggplot(data = subset(monocle_out[[1]]),
       aes(x = data_dim_1,
           y = data_dim_2)) +
    geom_segment(aes_string(x = "source_prin_graph_dim_1",
                            y = "source_prin_graph_dim_2",
                            xend = "target_prin_graph_dim_1",
                            yend = "target_prin_graph_dim_2"),
                 size = .25,
                 linetype = "solid", na.rm = TRUE,
                 color = "#2196F3",
                 data = monocle_out[[2]]) +
    geom_point(aes(color = Pseudotime),
               size = .3, stroke = 0, shape = 16, alpha = .2,
               data = monocle_out[[1]]) +
    scale_color_viridis_c(name = NULL) +
    theme_void() +
    theme(legend.justification = c(0, 1),
          legend.position = c(.19, .4),
          legend.text = element_text(family = "Arial",
                                     size = 4,
                                     margin = margin(t = 0, r = 0,
                                                     b = 0, l = -1.8,
                                                     unit = "mm")),
          legend.key.size = unit(1.5, "mm"),
          legend.margin = margin(t = 0, r = 0, b = 0, l = 0, unit = "mm")) +
    annotate("text",
             x = -6,
             y = 4.5,
             family = "Arial",
             label = "Pseudotime",
             color = "black",
             size = 2.5,
             vjust = "inward", hjust = "inward")

# add state labels
p_pseudotime <- p_pseudotime +
    geom_point(data = state_labels,
               aes(x, y),
               size = 2.0,
               stroke = 0,
               shape = 22,
               fill = "#E29C36",
               color = "#E29C36") +
    annotate("text",
             x = state_labels$x,
             y = state_labels$y,
             label = LETTERS[as.integer(state_labels$State)],
             size = 1.5,
             family = "Arial",
             color = "black")

p_pseudotime_bl19 <- plot_pseudotime_batch(batch = "BL19",
                                           text = "3F, D7\n",
                                           monocle_out = monocle_out)

p_pseudotime_primary <- plot_pseudotime_batch(batch = c("BL5", "BL6"),
                                           text = "Primary\n",
                                           monocle_out = monocle_out)

p_pseudotime_uninfected <- plot_pseudotime_batch(batch = "BL7",
                                           text = "Uninfected\n",
                                           monocle_out = monocle_out)

p_combined <- grid.arrange(p_pseudotime,
                           p_pseudotime_primary,
                           p_pseudotime_uninfected,
                           p_pseudotime_bl19,
                           nrow = 2)

Distribution of different cells across reprogramming trajectory

# get cell groups
cell_distribution_primary <-
    pData(cell_dataset) %>%
    filter(category %in% c("BL5", "BL6")) %>% group_by(State) %>%
    summarise(primary.count = n())

cell_distribution_uninfected <-
    pData(cell_dataset) %>%
    filter(category %in% c("BL7")) %>% group_by(State) %>%
    summarise(uninfected.count = n())

cell_distribution_10f_d7 <-
    pData(cell_dataset) %>%
    filter(category %in% c("BL8")) %>% group_by(State) %>%
    summarise("10f_d7.count" = n())

cell_distribution_10f_d14 <-
    pData(cell_dataset) %>%
    filter(category %in% c("BL18")) %>% group_by(State) %>%
    summarise("10f_d14.count" = n())

cell_distribution_3f_d7 <-
    pData(cell_dataset) %>%
    filter(category %in% c("BL19")) %>% group_by(State) %>%
    summarise("3f_d7.count" = n())

# prepare data
pseudotime_state_composition <-
    full_join(cell_distribution_primary, cell_distribution_uninfected,
              by = "State") %>%
    full_join(cell_distribution_10f_d7, by = "State") %>%
    full_join(cell_distribution_10f_d14, by = "State") %>%
    full_join(cell_distribution_3f_d7, by = "State") %>%
    arrange(State) %>% replace_na(list(primary.count = 0)) %>%
    select(- State) %>% `t`(.) %>% `/`(rowSums(.)) %>% `t` %>%
    as.data.frame %>%
    mutate(state = rownames(.)) %>%
    gather(key = category,
           value = ratio,
           - state) %>%
    mutate(category = factor(category,
                             levels = c("primary.count",
                                        "3f_d7.count",
                                        "10f_d14.count",
                                        "10f_d7.count",
                                        "uninfected.count")),
           state2 = LETTERS[as.integer(state)])

pseudotime_state_composition <-
    do.call(rbind.data.frame,
            lapply(levels(pseudotime_state_composition$category),
                   function(x) {
                       df_state_composition_ratio <-
                           pseudotime_state_composition[
                               pseudotime_state_composition$category == x, ]

                       df_state_composition_ratio$cum.ratio <-
                           cumsum(df_state_composition_ratio$ratio)

                       df_state_composition_ratio$label.position <-
                           df_state_composition_ratio$cum.ratio -
                           df_state_composition_ratio$ratio / 2

                       df_state_composition_ratio
                   }))

# plot
ggplot(pseudotime_state_composition, aes(x = category,
                                         y = ratio,
                                         fill = state2)) +
    geom_bar(stat = "identity") + coord_flip() +
    scale_fill_manual(name = "State",
                      values = c("#8DD3C7", "#FFFFB3", "#BEBADA",
                                 "#FB8072", "#80B1D3", "#FDB462",
                                 "#B3DE69",
                                 "#F1835C", "#D9D9D9")) +
    scale_x_discrete(name = NULL,
                     labels = c("Epicardial,\nprimary",
                                "3F, D7",
                                "10F, D14",
                                "10F, D7",
                                "Uninfected")) +
    scale_y_continuous(name = "Percentage",
                       labels = scales::percent) +
    theme_bw() +
    theme(axis.text = element_text(size = 6),
          axis.title = element_text(size = 6),
          axis.title.x = element_text(margin = margin(t = 0, r = 0,
                                                      b = 0, l = 0,
                                                      unit = "mm")),
          legend.title = element_text(size = 6),
          legend.text = element_text(size = 4,
                                     margin = margin(t = 0, r = 0,
                                                     b = 0, l = -1,
                                                     unit = "mm")),
          strip.text.x = element_text(size = 4),
          strip.text.y = element_text(size = 6),
          legend.key.size = unit(2.5, "mm"),
          legend.position = "right",
          # legend.box = "vertical",
          legend.direction = "vertical",
          legend.margin = margin(t = 0, r = 0, b = 0, l = -2, unit = "mm")) +
    geom_text(data = subset(pseudotime_state_composition,
                            state2  == "H"),
              aes(y = 1 - label.position,
                  label = paste0(round(subset(pseudotime_state_composition,
                                              state2  == "H")$ratio, 4) * 100,
                                 "%")), size = 2) +
    guides(fill = guide_legend(ncol = 3, byrow = TRUE))

pseudotime_state_composition2 <-
    do.call(rbind.data.frame,
            lapply(levels(pData(cell_dataset)$State), function(x) {
                composition <-
                    as.data.frame(table(pData(cell_dataset)[
                        pData(cell_dataset)$State == x, "category"]))

                names(composition) <- c("category", "count")
                composition$state <- x

                composition <-
                    composition[match(composition$category,
                                      (c("BL7", "BL8",
                                         "BL18", "BL19", "BL5"))), ]

                composition %<>%
                    mutate(percentage = count / sum(composition$count),
                           label = paste0(round(count / sum(composition$count),
                                                3)
                                          * 100, "%"),
                           label.position = cumsum(percentage) - percentage / 2)

                return(composition)
            })) %>% mutate(category = factor(category,
                                             levels = rev(c("BL7", "BL8",
                                                            "BL18", "BL19",
                                                            "BL5"))))

facet_labels <- paste("State",
                      LETTERS[as.integer(unique(pseudotime_state_composition2$state))])
names(facet_labels) <- unique(pseudotime_state_composition2$state)

ggplot(pseudotime_state_composition2,
           aes(x = "", y = percentage, fill = category)) +
    geom_bar(width = 1, stat = "identity") +
    coord_polar("y", start = 0) +
    facet_wrap(~ state,
               labeller = labeller(state = facet_labels)) +
    geom_text(data = pseudotime_state_composition2,
              aes(y = label.position,
                  label = pseudotime_state_composition2$label), size = 1.5) +
    scale_fill_manual(name = NULL,
                      labels = c("Epicardial, primary",
                                 "3F, D7",
                                 "10F, D14",
                                 "10F, D7",
                                 "Uninfected"),
                      values = c("#e37b3e",
                                 "#44b29c",
                                 "#de5a49",
                                 "#f7a8a5",
                                 "#f0ca4e")) +
    theme_bw() +
    theme(axis.text.x = element_blank(),
          axis.title.x = element_blank(),
          axis.title.y = element_blank(),
          # panel.border = element_blank(),
          panel.grid = element_blank(),
          axis.ticks = element_blank(),
          # text = element_text(size = 4),
          legend.text = element_text(size = 4,
                                     margin = margin(t = 0, r = 0,
                                                     b = 0, l = -1.5,
                                                     unit = "mm")),
          strip.text.x = element_text(size = 4),
          strip.text.y = element_text(size = 4),
          legend.key.size = unit(2.5, "mm"),
          legend.margin = margin(t = 0, r = 0, b = 0, l = -3, unit = "mm"))

Visualization of gene expressions

# Wt1
grep("Wt1", gene_symbols, value = TRUE)
## ENSMUSG00000074987 ENSMUSG00000016458 
##            "Wt1os"              "Wt1"
selected_gene <- "ENSMUSG00000016458"
# gene_symbols[selected_gene]

p_pseudotime_bl19_Wt1 <-
    plot_expr_pseudotime_batch(gene = selected_gene,
                               batch = c("BL19"),
                               text = paste(gene_symbols[selected_gene],
                                            "3F, Day 7", sep = "\n"),
                               expr_cpm = expr_cpm_use,
                               monocle_out = monocle_out)


grep("Bnc1", gene_symbols, value = TRUE)
## ENSMUSG00000025105 
##             "Bnc1"
selected_gene <- "ENSMUSG00000025105"
# gene_symbols[selected_gene]

p_pseudotime_bl19_Bnc1 <-
    plot_expr_pseudotime_batch(gene = selected_gene,
                               batch = c("BL19"),
                               text = paste(gene_symbols[selected_gene],
                                            "3F, Day 7", sep = "\n"),
                               expr_cpm = expr_cpm_use,
                               monocle_out = monocle_out)


grep("Myrf", gene_symbols, value = TRUE)
## ENSMUSG00000034057 ENSMUSG00000036098 
##            "Myrfl"             "Myrf"
selected_gene <- "ENSMUSG00000036098"
# gene_symbols[selected_gene]

p_pseudotime_bl19_Myrf <-
    plot_expr_pseudotime_batch(gene = selected_gene,
                               batch = c("BL19"),
                               text = paste(gene_symbols[selected_gene],
                                            "3F, Day 7", sep = "\n"),
                               expr_cpm = expr_cpm_use,
                               monocle_out = monocle_out)


grep("Upk3b", gene_symbols, value = TRUE)
## ENSMUSG00000042985 ENSMUSG00000006143 
##            "Upk3b"           "Upk3bl"
selected_gene <- "ENSMUSG00000042985"
# gene_symbols[selected_gene]

p_pseudotime_bl19_Upk3b <-
    plot_expr_pseudotime_batch(gene = selected_gene,
                               batch = c("BL19"),
                               text = paste(gene_symbols[selected_gene],
                                            "3F, Day 7", sep = "\n"),
                               expr_cpm = expr_cpm_use,
                               monocle_out = monocle_out)


p_combined <- grid.arrange(p_pseudotime_bl19_Wt1,
                           p_pseudotime_bl19_Bnc1,
                           p_pseudotime_bl19_Myrf,
                           p_pseudotime_bl19_Upk3b,
                           nrow = 2)

Enrichment of exogenous factors in 3F-reprogrammed cells

# select genes
genes_selected_10 <- c("ENSMUSG00000016458",
                       "ENSMUSG00000051910",
                       "ENSMUSG00000025105",
                       "ENSMUSG00000031965",
                       "ENSMUSG00000036098",
                       "ENSMUSG00000045680",
                       "ENSMUSG00000026628",
                       "ENSMUSG00000038193",
                       "ENSMUSG00000005836",
                       "ENSMUSG00000032419")

cells_uninfected <-
    pData(cell_dataset) %>% mutate(cell = rownames(.)) %>%
    filter(category %in% c("BL7")) %>%
    .$cell

# Normalize
expr_readcount_norm <- expr_readcount_use

expr_readcount_norm <-
    expr_readcount_norm[, Matrix::colSums(expr_readcount_norm > 0) >= 200]

expr_readcount_norm <-
    expr_readcount_norm[Matrix::rowSums(expr_readcount_norm > 0) >= 30, ]

expr_readcount_norm <-
    expr_readcount_norm[Matrix::rowSums(expr_readcount_norm) >= 60, ]

expr_readcount_norm@x <- median(Matrix::colSums(expr_readcount_norm)) *
    (expr_readcount_norm@x / rep.int(Matrix::colSums(expr_readcount_norm),
                                     diff(expr_readcount_norm@p)))

summary_enrichment_genes_selected <-
    do.call(rbind.data.frame,
            lapply(levels(pData(cell_dataset)$State), function(x) {
                cat("Computing cell state", LETTERS[as.integer(x)], "\n")

                cells_in_selected_group <-
                    rownames(pData(cell_dataset)[pData(cell_dataset)$State == x &
                                                         pData(cell_dataset)$category %in% c("BL19"),])

                y <- analyze_gene_enrichment(expr = expr_readcount_norm,
                                             cell_group_a = cells_in_selected_group,
                                             cell_group_b = cells_uninfected,
                                             gene_symbols = gene_symbols[rownames(expr_readcount_norm)],
                                             expr_cpm = expr_cpm_use)[genes_selected_10, ]

                y$category <- x
                return(y)
            })) %>%
    mutate(p.adj2 = ifelse(-log10(p.adj) == Inf, 400, -log10(p.adj)))
## Computing cell state A 
## Computing cell state B 
## Computing cell state C 
## Computing cell state D 
## Computing cell state E 
## Computing cell state F 
## Computing cell state G 
## Computing cell state H 
## Computing cell state I
facet_labels <- paste("State",
                      LETTERS[as.integer(unique(summary_enrichment_genes_selected$category))])
names(facet_labels) <- unique(summary_enrichment_genes_selected$category)

# plot
ggplot() +
    geom_abline(intercept = 0, slope = 1, linetype = 2) +
    geom_point(data = summary_enrichment_genes_selected,
               aes(pos.frac.b,
                   pos.frac.a,
                   size = p.adj2,
                   color = log2.effect),
                   alpha = .8,
                   stroke = 0, shape = 16) +
    facet_wrap(~ category,
               nrow = 3,
               labeller = labeller(category = facet_labels)) +
    coord_fixed() +
    scale_x_continuous(name = "Expr (%, uninfected cells)",
                       limits = c(0, 1), breaks = seq(0, 1, .2)) +
    scale_y_continuous(name = "Expr (%, indicated state)",
                       limits = c(0, 1), breaks = seq(0, 1, .2)) +
    scale_color_viridis_c(name = expression(paste("Log"[2], " effect"))) +
    theme_bw() +
    labs(size = expression(paste("-log"[10], " (p-value)"))) +
    theme(axis.text = element_text(family = "Arial", size = 7),
          axis.title = element_text(family = "Arial", size = 8),
          strip.text.x = element_text(family = "Arial", size = 8),
          legend.title = element_text(family = "Arial", size = 8),
          legend.text = element_text(family = "Arial", size = 6),
          legend.position = "bottom",
          legend.box = "horizontal",
          legend.key.size = unit(2.5, "mm"),
          legend.margin = margin(t = 0, r = 0, b = 0, l = 0, unit = "mm"),
          panel.grid.major = element_blank(),
          panel.grid.minor = element_blank()) +
    guides(color = guide_colorbar(order = 1),
           size = guide_legend(order = 2)) +
    geom_text_repel(data = summary_enrichment_genes_selected,
                    aes(pos.frac.b,
                        pos.frac.a,
                        label = summary_enrichment_genes_selected$symbol),
                    size = 2.5,
                    family = "Arial",
                    box.padding = .2,
                    point.padding = .2,
                    nudge_y = .15,
                    arrow = arrow(length = unit(.02, 'npc')),
                    segment.color = "grey35",
                    color = "black")

Exploring gene regulatory network during reprogramming

Extracting gene expression patterns

# get cell groups
cells_reprogr <-
    pData(cell_dataset) %>% mutate(cell = rownames(.)) %>%
    filter(! category %in% c("BL5", "BL6", "BL7")) %>%
    .$cell

cells_bl19 <-
    pData(cell_dataset) %>% mutate(cell = rownames(.)) %>%
    filter(category %in% c("BL19")) %>%
    .$cell

cells_trajectory_ordered <-
    pData(cell_dataset) %>%
    mutate(cell = rownames(.)) %>%
    arrange(Pseudotime) %>% .$cell

cell_states_selected <- c(1, 2, 4, 6, 8)

cells_trajectory_ordered_use <-
    pData(cell_dataset) %>%
    mutate(cell = rownames(.)) %>%
    filter(State %in% cell_states_selected & ! category %in% c("BL5",
                                                               "BL6",
                                                               "BL7")) %>%
    arrange(Pseudotime) %>% .$cell

cells_trajectory_ordered_lst_use <-
    lapply(cell_states_selected, function(x) {

        cells_in_selected_group <-
            rownames(pData(cell_dataset))[
                pData(cell_dataset)$State == x]

        cells_trajectory_ordered_use[
            cells_trajectory_ordered_use %in% cells_in_selected_group
            ]
    })
names(cells_trajectory_ordered_lst_use) <- cell_states_selected

# define function to merge cells
make_cell_groups <- function(cells,
                             number_of_members) {

    if (length(cells) %% number_of_members == 0) {
        mat <- matrix(cells, nrow = number_of_members)
        mat_lst <- lapply(as.list(as.data.frame(mat)), as.character)

    } else {
        cells_remainder <- cells[(length(cells) -
                                      (length(cells) %% number_of_members) + 1) :
                                     length(cells)]
        cells_selected <- cells[! cells %in% cells_remainder]
        mat <- matrix(cells_selected, nrow = number_of_members)

        mat_lst <- lapply(as.list(as.data.frame(mat)), as.character)
        mat_lst[[length(mat_lst)]] <- c(mat_lst[[length(mat_lst)]],
                                        cells_remainder)
    }
    return(mat_lst)
}

# set number of cells to merge
number_of_members <- 20

expr_cpm_avg_cells_trajectory_ordered_lst_use <-
    lapply(cells_trajectory_ordered_lst_use, function(x) {

        cells_lst <- make_cell_groups(x, number_of_members)

        sapply(cells_lst, function(y) {
            Matrix::rowMeans(expr_cpm_use[, y])
            })
    })

num_meta_cells <- lapply(expr_cpm_avg_cells_trajectory_ordered_lst_use,
                         function(x) {dim(x)[2]})

expr_cpm_avg_pseudo_meta_cells <-
    do.call(cbind.data.frame, expr_cpm_avg_cells_trajectory_ordered_lst_use)

colnames(expr_cpm_avg_pseudo_meta_cells) <-
    paste("reprogr", rep(letters[cell_states_selected],
                         num_meta_cells),
          seq_len(ncol(expr_cpm_avg_pseudo_meta_cells)), sep = ".")

kmeans_mat_use <-
    log10(expr_cpm_avg_pseudo_meta_cells[
        str_replace(rownames(readRDS(file.path(data_directory,
                         "de_paired_primary_epicardial_uninfected.rds"))),
                    "_.+$", "")
        , ] + 1)

# select genes to use
pos_frac <- .2
genes_kmeans_use <- rownames(kmeans_mat_use)[rowMeans(kmeans_mat_use > 0)
                                             >= pos_frac]
kmeans_mat_use <- kmeans_mat_use[genes_kmeans_use,]
kmeans_mat_scaled_use <- t(scale(t(kmeans_mat_use)))

num_centers <- 6
seed_kmeans <- 20180706
set.seed(seed_kmeans)
kmeans_out <- kmeans(kmeans_mat_scaled_use,
                     num_centers,
                     iter.max = 10,
                     nstart = 20)

x_breaks <- quantile(seq_len(dim(kmeans_out$centers)[2]),
                     probs = seq(0, 1, 1))

# rename k-means clusters
facet_labels <- paste("Signature", seq_len(num_centers))
names(facet_labels) <- c(6,
                         1,
                         3,
                         5,
                         2,
                         4)

num_genes <- as.data.frame(table(kmeans_out$cluster))
names(num_genes) <- c("cluster", "count")
num_genes$x <- quantile(seq_len(dim(kmeans_out$centers)[2]), .5)
num_genes$y <- 1
num_genes$count2 <- paste("n =", num_genes$count)
sum(num_genes$count)
## [1] 2040
# plot
kmeans_out$centers %>% t %>% as.data.frame %>%
    mutate(meta.cell = rownames(.),
           x = seq_len(nrow(.))) %>%
    gather(key = cluster,
           value = expr,
           - c(meta.cell, x)) %>%
    mutate(cluster = factor(cluster,
                            levels = c(6,
                                       1,
                                       3,
                                       5,
                                       2,
                                       4))) %>%
    ggplot(aes(x, expr)) + geom_line(size = .2) +
    facet_wrap(~ cluster, nrow = 2,
               labeller = labeller(cluster = facet_labels)) +
    labs(title = "Reprogrammed cells",
         y = "Scaled expr (z-score)") +
    scale_x_continuous(name = "Scaled pseudotime (state ABDFH)",
                       breaks = x_breaks) +
    theme_bw() +
    theme(axis.title = element_text(family = "Arial",
                                    size = 8),
          axis.text = element_text(family = "Arial", size = 7),
          axis.text.x = element_text(family = "Arial",
                                     angle = 90,
                                     vjust = .5,
                                     hjust = 1,
                                     size = 7),
          strip.text.x = element_text(family = "Arial",size = 7),
          plot.title = element_text(family = "Arial",
                                    size = 8,
                                    # margin = margin(t = 0, r = 0, b = 0, l = 0, unit = "mm"))
                                    hjust = .5),
          panel.grid.major = element_blank(),
          panel.grid.minor = element_blank()) +
    geom_text(data = num_genes,
              aes(x, y, label = count2),
              family = "Arial",
              vjust = -.5,
              size = 2.5)

Visualizing gene expression patterns

cells_primary <-
    pData(cell_dataset) %>% mutate(cell = rownames(.)) %>%
    filter(category %in% c("BL5", "BL6")) %>%
    .$cell

cells_primary_pseudotime_ordered <-
    cells_trajectory_ordered[cells_trajectory_ordered %in% cells_primary]

cells_primary_pseudotime_ordered_lst <-
    make_cell_groups(cells = cells_primary_pseudotime_ordered,
                     number_of_members = 20)

expr_cpm_avg_cells_pseudotime_ordered_primary <-
    sapply(cells_primary_pseudotime_ordered_lst, function(y) {
        Matrix::rowMeans(expr_cpm_use[, y])})

colnames(expr_cpm_avg_cells_pseudotime_ordered_primary) <-
    c("primary.1", "primary.2", "primary.3")

expr_cpm_avg_pseudo_meta_cells_reprogr_primary <-
    cbind(expr_cpm_avg_pseudo_meta_cells,
          expr_cpm_avg_cells_pseudotime_ordered_primary)

heatmap_mat_use <- expr_cpm_avg_pseudo_meta_cells_reprogr_primary
heatmap_mat_scaled_use <- t(scale(t(heatmap_mat_use)))
heatmap_mat_scaled_use <- heatmap_mat_scaled_use[genes_kmeans_use,]


column_annotation_complexheatmap_1 <-
    data.frame(State = rep(paste("State",
                                 LETTERS[cell_states_selected]),
                           num_meta_cells))
rownames(column_annotation_complexheatmap_1) <-
    seq_len(nrow(column_annotation_complexheatmap_1))

ha1 <- HeatmapAnnotation(df = column_annotation_complexheatmap_1,
                         col = list(State = c(setNames(c("black",
                                                         "grey70",
                                                         "grey70",
                                                         "grey70",
                                                         "black"),
                                                       paste("State", LETTERS[cell_states_selected])))),
                         annotation_legend_param = list(State = list(title = "State",
                                                                     title_gp = gpar(fontsize = 8),
                                                                     labels_gp = gpar(fontsize = 7))))

column_annotation_complexheatmap_2 <-
    data.frame(Cell = rep("Epicardial", 3))

ha2 <- HeatmapAnnotation(df = column_annotation_complexheatmap_2,
                         col = list(Cell = c("Epicardial" = "#56B4E9")),
                         annotation_legend_param = list(Cell = list(title = "Cell",
                                                                    title_gp = gpar(fontsize = 8),
                                                                    labels_gp = gpar(fontsize = 7))))

genes_cluster_dend_ordered_lst <-
    sapply(c(6,
             1,
             3,
             5,
             2,
             4), function (x) {
        genes_in_cluster <- names(kmeans_out$cluster)[kmeans_out$cluster == x]
        dend <- hclust(dist(heatmap_mat_scaled_use[genes_in_cluster,]))

        dend$labels
    })

genes_cluster_dend_ordered <-
    unlist(genes_cluster_dend_ordered_lst)
heatmap_mat_scaled_use <- heatmap_mat_scaled_use[genes_cluster_dend_ordered,]

# set row annotation
row_annotation_complexheatmap <-
    data.frame(Pattern = rep(1:6, lapply(genes_cluster_dend_ordered_lst,
                                         length)))

ha_row <- rowAnnotation(
    df = row_annotation_complexheatmap,
    col = list(Pattern = c(setNames(RColorBrewer::brewer.pal(n = 11,
                                                             name = "Set3")[1:6],
                                    1:6))),
    width = unit(1, "mm"),
    annotation_legend_param = list(
        Pattern = list(title = "Pattern",
                       title_gp = gpar(fontsize = 8),
                       labels_gp = gpar(fontsize = 7))))

# get heatmap 1
hm1 <- Heatmap(heatmap_mat_scaled_use[, 1:485],
               cluster_columns = FALSE,
               cluster_rows = FALSE,
               show_row_dend = FALSE,
               show_column_names = FALSE,
               show_row_names = FALSE,
               col = circlize::colorRamp2(c(-2, 0, 2),
                                          c("#4575B4", "white", "#D73027")),
               heatmap_legend_param =list(title = "Expr",
                                          title_gp = gpar(fontfamily = "Arial",
                                                          # fontsize = 8),
                                                          fontsize = 6),
                                          legend_direction = "vertical",
                                          labels_gp = gpar(family = "Arial",
                                                           # fontsize = 7),
                                                           fontsize = 5),
                                          # legend_width = unit(2, "mm"),
                                          legend_height = unit(5, "mm")),
               top_annotation = ha1,
               top_annotation_height = unit(1, "mm"),
               width = 2)

# get heatmap 2
hm2 <- Heatmap(heatmap_mat_scaled_use[, 486:488],
               cluster_columns = FALSE,
               cluster_rows = FALSE,
               show_row_dend = FALSE,
               show_column_names = FALSE,
               show_row_names = FALSE,
               col = circlize::colorRamp2(c(-2, 0, 2),
                                          c("#4575B4", "white", "#D73027")),
               heatmap_legend_param = list(title = "Expr",
                                           title_gp = gpar(fontfamily = "Arial",
                                                           fontsize = 8),
                                           labels_gp = gpar(family = "Arial",
                                                            fontsize = 7),
                                           legend_width = unit(2, "mm"),
                                           legend_height = unit(2, "mm")),
               show_heatmap_legend = FALSE,
               top_annotation = ha2,
               top_annotation_height = unit(1, "mm"),
               width = .02)

# select genes to mark
genes_to_mark <- unique(c(genes_selected_10,
                          "ENSMUSG00000015627",
                          "ENSMUSG00000031517"))

heatmap_mat_scaled_subset_indices <-
    which(rownames(heatmap_mat_scaled_use) %in% genes_to_mark)
heatmap_mat_scaled_subset_indices_labels <-
    gene_symbols[rownames(heatmap_mat_scaled_use)][heatmap_mat_scaled_subset_indices]

marked_genes <-
    rowAnnotation(link = row_anno_link(at = heatmap_mat_scaled_subset_indices,
                                       labels = heatmap_mat_scaled_subset_indices_labels,
                                       side = "right",
                                       labels_gp = gpar(fontfamily = "Arial",
                                                        fontsize = 5),
                                       lines_gp = gpar(col = "grey50",
                                                       lwd = .5),
                                       padding = 1),
                  width = unit(2.0, "mm") +
                      max_text_width(heatmap_mat_scaled_subset_indices_labels,
                                     gp = gpar(fontsize = 6)))
# plot
draw(ha_row + hm1 + hm2 + marked_genes,
     gap = unit(c(1, 1, 1), "mm"))

Information about the current R session

sessionInfo()
## R version 3.5.2 (2018-12-20)
## Platform: x86_64-apple-darwin18.2.0 (64-bit)
## Running under: macOS Mojave 10.14.3
## 
## Matrix products: default
## BLAS/LAPACK: /usr/local/Cellar/openblas/0.3.5/lib/libopenblas_haswellp-r0.3.5.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
##  [1] grid      splines   stats4    parallel  stats     graphics  grDevices
##  [8] utils     datasets  methods   base     
## 
## other attached packages:
##  [1] ggrepel_0.8.0         extrafont_0.17        gridExtra_2.3        
##  [4] scales_1.0.0.9000     magrittr_1.5.0.9000   forcats_0.4.0.9000   
##  [7] stringr_1.4.0.9000    dplyr_0.8.0.9006      purrr_0.3.1          
## [10] readr_1.3.1.9000      tidyr_0.8.3.9000      tibble_2.0.1.9001    
## [13] tidyverse_1.2.1.9000  ComplexHeatmap_1.20.0 igraph_1.2.4         
## [16] monocle_2.10.1        DDRTree_0.1.5         irlba_2.3.3          
## [19] VGAM_1.1-1            ggplot2_3.1.0.9000    Biobase_2.42.0       
## [22] BiocGenerics_0.28.0   Matrix_1.2-15        
## 
## loaded via a namespace (and not attached):
##  [1] matrixStats_0.54.0   fs_1.2.6.9000        lubridate_1.7.4.9000
##  [4] RColorBrewer_1.1-2   httr_1.4.0.9000      docopt_0.6.1        
##  [7] tools_3.5.2          backports_1.1.3      R6_2.4.0            
## [10] DBI_1.0.0            lazyeval_0.2.1       colorspace_1.4-0    
## [13] GetoptLong_0.1.7     withr_2.1.2          tidyselect_0.2.5    
## [16] extrafontdb_1.0      compiler_3.5.2       cli_1.0.1           
## [19] rvest_0.3.2          xml2_1.2.0           labeling_0.3        
## [22] slam_0.1-45          digest_0.6.18        rmarkdown_1.11      
## [25] sparsesvd_0.1-4      pkgconfig_2.0.2      htmltools_0.3.6     
## [28] dbplyr_1.3.0         limma_3.38.3         rlang_0.3.1.9000    
## [31] GlobalOptions_0.1.0  readxl_1.3.0.9000    rstudioapi_0.9.0    
## [34] FNN_1.1.3            shape_1.4.4          generics_0.0.2      
## [37] combinat_0.0-8       jsonlite_1.6         Rcpp_1.0.0          
## [40] munsell_0.5.0        viridis_0.5.1        stringi_1.3.1       
## [43] yaml_2.2.0           Rtsne_0.15           plyr_1.8.4          
## [46] crayon_1.3.4         lattice_0.20-38      haven_2.1.0         
## [49] circlize_0.4.5       hms_0.4.2.9001       zeallot_0.1.0       
## [52] knitr_1.21           pillar_1.3.1.9000    rjson_0.2.20        
## [55] reshape2_1.4.3       reprex_0.2.1.9000    glue_1.3.0.9000     
## [58] evaluate_0.13        modelr_0.1.4         vctrs_0.1.0.9002    
## [61] Rttf2pt1_1.3.7       cellranger_1.1.0     gtable_0.2.0        
## [64] RANN_2.6.1           assertthat_0.2.0     xfun_0.5            
## [67] broom_0.5.1.9000     qlcMatrix_0.9.7      HSMMSingleCell_1.2.0
## [70] viridisLite_0.3.0    pheatmap_1.0.12      cluster_2.0.7-1     
## [73] fastICA_1.2-1        densityClust_0.3